# delimit ;
set more 1;
clear all;
* NOTE(review): set more 1 looks like legacy syntax for switching off the more pause - confirm against the Stata version in use;

* This STATA do-file loads, cleans and transforms the 2010 U.S. survey from the Osaka Preference Parameter Study;
* After obtaining access to the data (see the README file in the replication package), the provided files should include a STATA data file called "2010_USA.dta";
* That file should have (approximately) 7046 rows (i.e. individual observations) and 475 columns (i.e. demographics and survey question responses);

* Set the top-level directory of the replication package. This is the only path that needs to be edited;
global upper_dir = "C:\Users\ejv5165\OneDrive - The Pennsylvania State University\Research\DFVW\replication package"; * adjust as needed;

* The remaining directories are defined relative to the top-level directory and should be left unchanged;
global osaka_dir = "$upper_dir\osaka"; 		* the original data set is loaded from here;
global data_dir = "$upper_dir\matlab\data"; * the final transformed data set is saved here, to be used in Matlab later;
global scratch_dir = "$osaka_dir\scratch"; 	* any user-created intermediate files are saved in this folder;

* Make the top-level directory the working directory;
cd "$upper_dir";

* Load the raw survey data, discarding anything currently in memory;
use "$osaka_dir\2010_USA.dta", clear;

* Core variable: survey question q5, a choice between receiving 100 in one month or a different amount (100+X) in 13 months;
* The pattern of answers across items q5_2 to q5_10 places the annual discount rate in an interval;
* Because the menu of choices is discrete, the rate is set to the upper bound of that interval;
* NOTE(review): the code below implies answer 1 = earlier payment and answer 2 = later payment - confirm against the codebook;

* Inspect the raw distribution of responses, one item at a time;
foreach item of varlist q5_* {;
	tabulate `item';
};

* Assuming log utility, the discount factor beta solves the (approximate) indifference condition log(100) = beta * log(100+X);
* Start from missing and fill in case by case. Later assignments overwrite earlier ones;
generate disc_factor = .;

* Answer 2 already at item q5_2: beta = 1, i.e. r = 0 (interval -0.05 to 0);
replace disc_factor = 1 if q5_2 == 2;

* Switch from answer 1 at item k to answer 2 at item k+1, for k = 2 to 9;
* The later amounts 102 104 106 110 120 140 180 250 give interval upper bounds 0.02 0.04 0.06 0.10 0.20 0.40 0.80 1.50;
local k = 2;
foreach payoff of numlist 102 104 106 110 120 140 180 250 {;
	local knext = `k' + 1;
	replace disc_factor = log(100)/log(`payoff') if q5_`k' == 1 & q5_`knext' == 2;
	local k = `knext';
};

* Build the two conditions "answer 2 on every item q5_2 to q5_10" and "answer 1 on every item q5_2 to q5_10";
local all2 "q5_2 == 2";
local all1 "q5_2 == 1";
forvalues j = 3/10 {;
	local all2 "`all2' & q5_`j' == 2";
	local all1 "`all1' & q5_`j' == 1";
};

* Respondents who always answer 2 are coded as r = 0 (their rate may in fact lie below -0.05);
* This repeats the value already assigned by the q5_2 == 2 rule above, since none of the switching rules applies to them;
replace disc_factor = 1 if `all2';

* Respondents who always answer 1: the exact rate cannot be identified, so a later amount of 300 is imposed;
replace disc_factor = log(100)/log(300) if `all1';

tabulate disc_factor;
summarize disc_factor, detail;

* Convert the factor into a rate: r = (1 - beta)/beta = (log(100+X) - log(100))/log(100);
generate disc_rate = (1 - disc_factor)/disc_factor;

* Additional observables: age and schooling of the respondent;
gen age2010 = AGE; * respondent's age in 2010;

* Respondent's highest level of completed education, copied directly from survey variable bq6_you;
gen educ_cat = bq6_you;

tab educ_cat;

* Define (approximate) years of education based on the respondent's highest level of completed education;
* Note: the exact numbers don't really matter since we will discretize them later anyway into "high school or less", "some college or college", and "graduate";
* Any category outside 1-9 (including missing) leaves educ_years missing;
gen educ_years = .;
replace educ_years = 10 if educ_cat == 1; * grade school;
replace educ_years = 11 if educ_cat == 2; * some high school;
replace educ_years = 12 if educ_cat == 3; * graduated high school;
replace educ_years = 13 if educ_cat == 4; * some college - no degree;
replace educ_years = 14 if educ_cat == 5; * 2 year associate's degree (NOTE(review): label wording assumed - confirm against the codebook);
replace educ_years = 16 if educ_cat == 6; * graduated college - 4 year bachelor's degree;
replace educ_years = 17 if educ_cat == 7; * some postgraduate studies - no degree;
replace educ_years = 18 if educ_cat == 8; * masters degree;
replace educ_years = 19 if educ_cat == 9; * doctoral degree;

* Check the newly constructed variable (educ_cat was already tabulated above);
tab educ_years;

* Store the full working data set (original plus derived variables) in the scratch folder;
cd "$scratch_dir";
save temp_osaka_2010.dta, replace;

* For our subsequent analysis in MATLAB, we keep the following variables;

keep  PANEL_ID disc_rate disc_factor age2010 educ_years;
order PANEL_ID disc_rate disc_factor age2010 educ_years;
sort PANEL_ID;

* Save a Stata copy of the reduced data set in the scratch folder;
cd "$scratch_dir";
save osaka_2010_final.dta, replace;

* Last step before exporting for MATLAB: code every missing value as -9;
* missing() also catches extended missing values (.a to .z) that a comparison with . alone would leave in place,
  and which would otherwise be written to the text file as non-numeric entries;
foreach var of varlist * {;
replace `var' = -9 if missing(`var');
};

* Write the data, without a header row, to the data folder to be used in MATLAB;
cd "$data_dir";
outsheet using osaka_2010_final.txt, nonames replace; 
